***********************************************************
* Internal migration and crime in Brazil *
* Author: Eva-Maria Egger 

* Contact: egger@wider.unu.edu
***********************************************************

* This do-file 
	* produces the summary statistics and descriptive figures. (starts at line 33 )
	* runs the main estimations of the paper at the Microregiao level. (starts at line 78 )
	* Then it runs the analysis of channels at Microregiao and individual level. (starts at line 184 )
	* Last it runs all robustness checks. (starts at line 376 )

*If you wish to replicate a specific table or figure, you can search for "Table X" or "Figure X" (insert the table or figure number instead of X)
	
***********************************************************
** Set directories and globals

global tables
global graphs 

***********************************************************
* Data and do-files used:

	*INPUTS: 
		*"MR_analysis.dta"
		*"Individuals_analysis.dta"
		*"MR_GPSStests_data.dta"
		*"MR_GPSS_data"
		
***********************************************************
***** SUMMARY STATISTICS ******
***********************************************************
** Table 1,  Figure 5, Figure 6 and Figure 7
***********************************************************

use "MR_analysis", clear
xtset micregion year

//replace missing values of migration rates to zero:
recode r_inD_ .=0 if year>=2004 & year!=2010


* Table 1: Summary statistics of in-migration and homicide rates
sum homicrate_ if year>=2005 & l.r_inD_!=.
sum r_inD_ if year>=2004 & year!=2010 & f.homicrate_!=.


* Figure 5: Relationship between in-migration and homicide rates
tw scatter homicrate_ l.r_inD_ if year>=2005 || lfit homicrate_ l.r_inD_ if year>=2005, ///
	title("Linear fit of homicide rate on in-migration rate") xtitle(In-migration rate) ytitle(Homicide rate) legend(lab (1 "Homicide rate"))
graph export "$graphs/MR_lfit_homicides_migration.pdf", replace


* Figure 6 and Figure 7: Maps of in-migration and homicide rates

	*Get average in-migration rates over study period
bys micregion: egen inmig_mean=mean(r_inD_) if year>=2004 & year<=2009
	*Get average homicide rates over study period
bys micregion: egen homic_mean=mean(homicrate_) if year>=2005

keep if year==2009

spmap inmig_mean using "MR_coord", id(id) clm(q) cln(6) ///
		ocolor(none ..) fcolor(Greys) ndfcolor(white) ndocolor(none ..) ///
		title("In-migration rates, 2005 to 2010" , size(msmall)) ///
		 legtitle("Microregiões, 6 quantiles")
	graph export "$graphs\MR_map_inmigration.pdf", replace

spmap homic_mean using "MR_coord", id(id) clm(q) cln(6) ///
		ocolor(none ..) fcolor(Greys) ndfcolor(white) ndocolor(none ..) ///
		title("Homicide rates, 2005 to 2010" , size(msmall)) ///
		 legtitle("Microregiões, 6 quantiles")
	graph export "$graphs\MR_map_homicides.pdf", replace

***********************************************************
***** MAIN ANAlYSIS ******
***********************************************************
** Table 2, Table 3 and Table 4
***********************************************************

* Table 2 and Table 4: Main regressions ******
// 2SLS //
***Fixed effects model with cluster-robust s.e. with year dummies
//log-log of rates (inmigrants per 100000 on homicides per 100000)

use "MR_analysis", clear
xtset micregion year

estimates clear
 
 *OLS
 reghdfe l_homicrate_ l.l_r_inD_ l.zMIVE l_population_ [aw = population_] , absorb(micregion DYear) cluster(micregion) nocons
	estadd scalar R2=e(r2)
	estadd scalar Ftest=e(F)
	eret2 local fe "Yes"
	eret2 local years "Yes"
	eret2 local ufs "No"
	eststo M_ols
	*OLS with state-year FEs
 reghdfe l_homicrate_ l.l_r_inD_ l.zMIVE l_population_ [aw = population_] , absorb(micregion DYear uf_year) cluster(micregion) nocons
	estadd scalar R2=e(r2)
	estadd scalar Ftest=e(F)
	eret2 local fe "Yes"
	eret2 local years "Yes"
	eret2 local ufs "Yes"
	eststo M_olsFE
		
*define analysis sample for all following regressions:
	g samp=e(sample)

	*Reduced Form 
reghdfe l_homicrate_ l.MIVE_INDall_ l.zMIVE l_population_ if samp==1 [aw = population_] , absorb(micregion DYear ) cluster(micregion) nocons
	estadd scalar R2=e(r2)
	estadd scalar Ftest=e(F)
	eret2 local fe "Yes"
	eret2 local years "Yes"
	eret2 local ufs "No"
	eststo M_RF
	*RF with state-year FEs
reghdfe l_homicrate_ l.MIVE_INDall_ l.zMIVE l_population_ if samp==1  [aw = population_] , absorb(micregion DYear uf_year )  cluster(micregion) nocons
	estadd scalar R2=e(r2)
	estadd scalar Ftest=e(F) 
	eret2 local fe "Yes"
	eret2 local years "Yes"
	eret2 local ufs "Yes"
	eststo M_RFFE
	*1st stage
reghdfe l.l_r_inD_ l.MIVE_INDall_ l.zMIVE l_population_ [aw = population_] if samp==1, absorb(micregion DYear ) cluster(micregion) nocons
	estadd scalar R2=e(r2)
	eret2 local fe "Yes"
	eret2 local years "Yes"
	eret2 local ufs "No"
	eststo F_MIVED	
	*1st stage with state-year FEs
reghdfe l.l_r_inD_ l.MIVE_INDall_ l.zMIVE l_population_ [aw = population_] if samp==1, absorb(micregion DYear uf_year ) cluster(micregion) nocons
	estadd scalar R2=e(r2)
	eret2 local fe "Yes"
	eret2 local years "Yes"
	eret2 local ufs "Yes"
	eststo F_MIVEDuf
	*2SLS
ivreghdfe l_homicrate_ l.zMIVE l_population_ (l.l_r_inD_ = l.MIVE_INDall_) [aw = population_] , absorb(micregion DYear )cluster(micregion) nocons
	estadd scalar R2=e(r2_a)
	estadd scalar Ftest=e(F)
	estadd scalar Finsttest=e(widstat)
	eret2 local fe "Yes"
	eret2 local years "Yes"
	eret2 local ufs "No"
	eststo MIVED
	*2SLS with state-year FEs
ivreghdfe l_homicrate_ l.zMIVE l_population_ (l.l_r_inD_ = l.MIVE_INDall_) [aw = population_] , absorb(micregion DYear uf_year ) cluster(micregion) nocons 
	estadd scalar R2=e(r2_a)
	estadd scalar Ftest=e(F)
	estadd scalar Finsttest=e(widstat)
	eret2 local fe "Yes"
	eret2 local years "Yes"
	eret2 local ufs "Yes"
	eststo MIVEDuf

g mains=e(sample)

estout M* using "$tables\tab2_tab4_2ndstage.txt", cells(b(star  fmt(3) label(Coef.)) se(par fmt(3))) legend label title(2SLS Estimation: Second stage, first difference) postfoot(Weighted by microregion population. S.E. clustered at microregiao level.) stats(fe years ufs N R2 Finsttest N_clust, labels("Microregi\~{a}o FEs" "Year FEs" "State-year FEs" "Observations" "R-squared" "Kleibergen-Paap F-test" "Number of clusters") fmt(%8.0f %8.0g %8.0g %8.3f %8.3f %8.0f)) starlevels(* .10 ** .05 *** .01) replace

estout F_M* using "$tables\tab2_tab4_firststage.txt", cells(b(star  fmt(3) label(Coef.)) se(par fmt(3))) legend label title(2SLS Estimation: First stage, first difference) postfoot(Weighted by microregion population. S.E. clustered at microregiao level.) stats(fe years ufs N R2 N_clust, labels("Microregi\~{a}o FEs" "Year FEs" "State-year FEs" "Observations" "R squared" "Number of clusters") fmt(%8.0f %8.0g %8.0g %8.3f %8.0f)) starlevels(* .10 ** .05 *** .01) replace

***********************************************************
*** Table 3: Test for economic impacts of local labor demand shocks at origin
estimates clear

foreach v in l_gdp_ l_averagemonthlywage l_numemp Mavgwage{
	reghdfe `v' l.zMIVE l_population_ [aw = population_] if mains==1 , absorb(micregion DYear uf_year ) cluster(micregion) nocons
	eret2 local fe "Yes"
	eret2 local years "Yes"
	eret2 local ufs "Yes"
	eret2 local xs "Yes"
	eststo LLS_`v'
}
estout LLS_* using "$tables\tab3_MR_LLS.txt", cells(b(star  fmt(3) label(Coef.)) se(par fmt(3))) drop(l_population_) legend label title(Local labor demand shocks) postfoot(Weighted by microregion population. S.E. clustered at microregiao level. Controls include population.) stats(xs fe years ufs N r2 N_clust, labels("Controls" "Microregi\~{a}o FEs" "Year FEs" "State-year FEs" "Observations" "R-squared" "Number of clusters") fmt(%8.0f %8.0f %8.0f %8.0f %8.3f %8.0f)) starlevels(* .10 ** .05 *** .01) replace 
 

***********************************************************
***** CHANNELS ******
***********************************************************
** Table 5, Table 9, Table 6 and Table 7
***********************************************************

* Table 5: Testing for LATE: Run Probit of likelihood of specific characteristic (low education, female, young, young and male, multiple migrant spells) for each migration year on migrant sample

	*comp_mig refers to migrants who have moved more than once within past five years to inform footnote 3.
	
use "Individuals_analysis.dta", clear

forv y=2004/2009{
	recode o_MIVE_`y' .=0
	lab var o_MIVE_`y' "`y'"
}	

estimates clear
foreach v in loweduc sex young ymale comp_mig{
	xi: probit `v' o_MIVE_2009 i.uf_ori if time_MC==0, robust
		margins , dydx(o_MIVE_2009) 
		eststo M09_`v'
	xi: probit `v' o_MIVE_2008 i.uf_ori if time_MC==1 , robust
		margins , dydx(o_MIVE_2008) 
		eststo M08_`v': mfx
	xi: probit `v' o_MIVE_2007 i.uf_ori if time_MC==2 , robust
		margins , dydx(o_MIVE_2007) 
		eststo M07_`v': mfx
	xi: probit `v' o_MIVE_2006 i.uf_ori if time_MC==3 , robust
		margins , dydx(o_MIVE_2006) 
		eststo M06_`v': mfx
	xi: probit `v' o_MIVE_2005 i.uf_ori if time_MC==4 , robust
		margins , dydx(o_MIVE_2005) 
		eststo M05_`v': mfx
	if `v'!=comp_mig{
		xi: probit `v' o_MIVE_2004 i.uf_ori if time_MC==5 , robust
			margins , dydx(o_MIVE_2004) 
			eststo M04_`v': mfx
	}
}
 
estout M* using "$tables\Individuals_LATE.txt", ///
		cells(b(star fmt(%9.3f)) se(par)) ///
		 label wrap indicate(State dummies = _Iuf*) ///
		 stats(N chi2 , labels("Observations" "Chi-squared") fmt(%8.0f %8.3g)) ///
		 legend drop(_cons) collabels(none) margin replace

** Table 9: Probit of likelihood of specific characteristic (low education, female, young, young and male) for each migration year and different distance cut-offs on migrant sample
		 		 
estimates clear
local c 5
forv y=2004/2009{
	foreach v in loweduc sex young ymale{
		foreach d in 100 500 1000 1500{
			xi: probit `v' o_MIVE_`y' i.uf_ori if time_MC==`c' & migD`d'==1 , robust
			margins , dydx(o_MIVE_`y') 
			eststo M`y'_`v'_D`d': mfx
		}
	}
	local --c
}
local c 4
forv y=2005/2009{
	foreach v in comp_mig{
		foreach d in 100 500 1000 1500{
			xi: probit `v' o_MIVE_`y' i.uf_ori if time_MC==`c' & migD`d'==1 , robust
			margins , dydx(o_MIVE_`y') 
			eststo M`y'_`v'_D`d': mfx
		}
	}
	local --c
}

estout M* using "$tables\Individuals_LATE_distance.txt", ///
		cells(b(star fmt(%9.3f)) se(par)) label wrap indicate(State dummies = _Iuf*) ///
		 stats(N chi2 , labels("Observations" "Chi-squared") fmt(%8.0f %8.3g)) ///
		 legend drop(_cons) collabels(none) margin replace
 
		 
***********************************************************		 
** Table 6: Channel at MR level: impact of immigration on other labor market variables: employment / wages

use "MR_analysis" , clear

xtset micregion year
estimates clear

foreach v in unemployed_ formal_ lhwage_ ymaleUE_ { 
	ivreghdfe `v' l.zMIVE l_population_ (l.l_r_inD_ = l.MIVE_INDall_) [aw = population_]  , absorb(micregion DYear uf_year ) cluster(micregion) nocons 
	estadd scalar Finsttest=e(widstat)
	eret2 local fe "Yes"
	eret2 local xs "Yes"
	eret2 local years "Yes"
	eret2 local ufs "Yes"
	eststo M_`v'	
}
	*get main result for PNAD sample:
ivreghdfe l_homicrate_ l.zMIVE l_population_ (l.l_r_inD_ = l.MIVE_INDall_) [aw = population_]  if e(sample) , absorb(micregion DYear uf_year ) cluster(micregion) nocons 
	eret2 local fe "Yes"
	eret2 local xs "Yes"
	eret2 local years "Yes"
	eret2 local ufs "Yes"
	estadd scalar Finsttest=e(widstat)
	eststo M_crime_PNAD	
	
estout M_crime_PNAD M_unemployed_ M_formal_ M_lhwage_ M_ymaleUE_ using "$tables\tab6_MRchannels.txt", ///
cells(b(star  fmt(3) label(Coef.)) se(par fmt(3))) legend label title(2SLS Estimation: Second stage) postfoot(Weighted by microregion population; S.E. clustered at microregiao level.) stats(xs fe years ufs N Finsttest, labels("Controls" "Microregi\~{a}o FEs" "Year FEs" "State-year FEs" "Observations" "Kleibergen-Paap F-test") fmt(%8.0g %8.0g %8.0g %8.1f)) starlevels(* .10 ** .05 *** .01) replace

	** to compute contribution of young, unemployed men to total effect:
		*1. get average share of young, unemployed male in sample:
		sum ymaleUE_ if e(sample)
	dis 0.023/r(mean)  // =3.3 % increase in young unemployed men
	dis 3.3*0.33 // 0.33 is elasticity from Britto et al. (2020)
	
***********************************************************

*** Table 7: Channel: high or low informality / high or low crime inertia
use "MR_analysis" , clear
xtset micregion year

*gen formal sector size by year using number of jobs from RAIS:
g sh_formal = numemp/population_
** Heterogeneity: Sub-samples by size of informality/inequality/unemployment etc.
// Homicide rate in 2000 
recode homic2000 0=.
g homic2004=.
	bysort micregion: replace homic2004 = homicrate_[4]
// Formal sector size in 2004 
g sh_formal2004=.
	bysort micregion: replace sh_formal2004 = sh_formal[4]
g sh_formalPNAD=.
	bysort micregion: replace sh_formalPNAD = formal_[4]

*sub-samples: above and below median of each variable
foreach v in homic2000_ sh_formal2004 sh_formal sh_formalPNAD{
	sum `v' , d
	g med_`v' =  r(p50)
	g `v'D = (`v'>=med_`v' & `v'!=.)
}

//Note: Table 7 in paper presents results using sh_formal2004 to define high and low informality. Footnote 13 refers to the other two possible ways to define high or low informality using sh_formal and sh_formalPNAD.

xtset micregion year
estimates clear

	*Table 7 columns 1 to 4
//2SLS Main regression: *by sub-sample (> than median)
foreach v in homic2000_ sh_formal2004 sh_formal sh_formalPNAD {
	ivreghdfe l_homicrate_ l.zMIVE l_population_ (l.l_r_inD_ = l.MIVE_INDall_) [aw = population_] if `v'D==1, absorb(micregion DYear uf_year ) cluster(micregion) nocons 
	estadd scalar Finsttest=e(widstat)
	eret2 local fe "Yes"
	eret2 local xs "Yes"
	eret2 local years "Yes"
	eret2 local ufs "Yes"
	eststo M1_`v'	
	ivreghdfe l_homicrate_ l.zMIVE l_population_ (l.l_r_inD_ = l.MIVE_INDall_) [aw = population_] if `v'D==0, absorb(micregion DYear uf_year ) cluster(micregion) nocons 		
	estadd scalar Finsttest=e(widstat)
	eret2 local fe "Yes"
	eret2 local xs "Yes"
	eret2 local years "Yes"
	eret2 local ufs "Yes"
	eststo M0_`v'
}

estout M* using "$tables\tab7_col1-4_heterogeneity.txt", ///
cells(b(star  fmt(3) label(Coef.)) se(par fmt(3))) legend label title(2SLS Estimation: Second stage) postfoot(Weighted by microregion population; S.E. clustered at microregiao level.) stats(xs fe years ufs N Finsttest, labels("Controls" "Microregi\~{a}o FEs" "Year FEs" "State-year FEs" "Observations" "Kleibergen-Paap F-test") fmt(%8.0g %8.0g %8.0g %8.3f)) starlevels(* .10 ** .05 *** .01) replace

*Table 7 columns 5 to 8: Labor market impacts by informal sector size:

xtset micregion year
estimates clear

foreach v in formal_ lhwage_{ 
	ivreghdfe `v' l.zMIVE l_population_ (l.l_r_inD_ = l.MIVE_INDall_) if sh_formal2004D==1 [aw = population_] , absorb(micregion DYear uf_year ) cluster(micregion) nocons 
	estadd scalar Finsttest=e(widstat)
	eret2 local fe "Yes"
	eret2 local xs "Yes"
	eret2 local years "Yes"
	eret2 local ufs "Yes"
	eststo Mf1_`v'	
	ivreghdfe `v' l.zMIVE l_population_ (l.l_r_inD_ = l.MIVE_INDall_) if sh_formal2004D==0 [aw = population_] , absorb(micregion DYear uf_year ) cluster(micregion) nocons 
	estadd scalar Finsttest=e(widstat)
	eret2 local fe "Yes"
	eret2 local xs "Yes"
	eret2 local years "Yes"
	eret2 local ufs "Yes"
	eststo Mf0_`v'
	}
	
estout M* using "$tables\tab7_col5-8_LMimpacts_heterogeneity.txt", ///
cells(b(star  fmt(3) label(Coef.)) se(par fmt(3))) legend label title(2SLS Estimation: Second stage) postfoot(Weighted by microregion population; S.E. clustered at microregiao level.) stats(xs fe years ufs N Finsttest, labels("Controls" "Microregi\~{a}o FEs" "Year FEs" "State-year FEs" "Observations" "Kleibergen-Paap F-test") fmt(%8.0g %8.0g %8.0g %8.3f)) starlevels(* .10 ** .05 *** .01) replace

***********************************************************
** ROBUSTNESS **
***********************************************************
** Figure 1, Figure 2, Figure 3 and Figure 4
** Table 8, Table 10, Table A.10, Table A.11, Table A.12 
***********************************************************

** Figure 1 & Figure 2: Estimates for different distance cut-offs
use "MR_analysis.dta", clear

estimates clear

forv d=100(100)1500{
	ivreghdfe l_homicrate_ l.zMIVE l_population_ (l.l_r_inD`d'_ = l.MIVE_IND`d'_) [aw = population_] , absorb(micregion DYear uf_year) cluster(micregion) nocons
	estadd scalar Finsttest=e(widstat)
	eststo D`d'
	scalar Ftest = e(widstat)
	g Ftest_`d'= Ftest
}	

set scheme sj
	*Figure 1: plot estimates
coefplot D100 D200 D300 D400 D500 D600 D700 D800 D900 D1000 D1100 D1200 D1300 D1400 D1500, ///
		vertical ciopts(recast(rcap) lcolor(black)) /// D1600 D1700 D1800 D1900 D2000
		mcolor(black) m(circle) drop(L.zMIVE l_population_ ) xlabel(#10) xtitle(Distance in 100km) ytitle(Coefficient size and C.I.) legend(off) yscale(range(0 2)) ylabel(#10)
	graph export "$graphs\fig1_Distanceeffect.pdf", replace

	*Figure 2: plot F-test values
keep Ftest_*
keep in 1
g ID = 1
reshape long Ftest_ , i(ID) j(obs)
set scheme sj
scatter Ftest_ obs , yline(10) xtitle(Distance in km) ytitle(Kleibergen-Paap F-test)
	graph export "$graphs\fig2_Ftest_distances.pdf", replace

***********************************************************

* To identify locations with largest Rothemberg weights following Goldsmith-Pinkham et al. (2020) to be used in following analyses run:
use "MR_GPSS_data", clear
set seed 12345

*Define macros
local controls l_population_ zMIVE
local weight population_
local y l_homicrate_
local x l_r_inD_
local z MIVE_INDall
local ind_stub pin
local growth_stub shock
local time_var year
local cluster_var micregion

tsset, clear

drop if micregion == . | l_homicrate_==.
levelsof `time_var', local(years)

forv t=2005/2010 {
	foreach var of varlist `ind_stub'* {
		gen t`t'_`var' = (year == `t') * `var'
		}
	foreach var of varlist `growth_stub'* {
		gen t`t'_`var'b = `var' if year == `t'
		egen t`t'_`var' = max(t`t'_`var'b), by(micregion)
		drop t`t'_`var'b
		}
	}

foreach var of varlist `ind_stub'* {
	if regexm("`var'", "`ind_stub'(.*)") {
		local ind = regexs(1) 
		}
	tempvar temp
	qui gen `temp' = `var' * `growth_stub'`ind'
	qui regress `x' `temp' `controls' [aweight=`weight'], cluster(micregion)
	local pi_`ind' = _b[`temp']
	qui test `temp'
	local F_`ind' = r(F)
	qui regress `y' `temp' `controls' [aweight=`weight'], cluster(micregion)
	local gamma_`ind' = _b[`temp']
	drop `temp'
	}

preserve
keep `ind_stub'* micregion year `weight'
reshape long `ind_stub', i(micregion year) j(ind)
gen `ind_stub'pop = `ind_stub'*`weight'
collapse (sd) `ind_stub'sd = `ind_stub' (rawsum) `ind_stub'pop `weight' [aweight = `weight'], by(ind year)
tempfile tmp
save `tmp'
restore

bartik_weight, z(t*_`ind_stub'*) weightstub(t*_`growth_stub'*) x(`x') y(`y') controls(`controls') weight_var(`weight')

mat beta = r(beta)
mat alpha = r(alpha)
mat gamma = r(gam)
mat pi = r(pi)
mat G = r(G)
qui desc t*_`ind_stub'*, varlist
local varlist = r(varlist)

clear
svmat beta
svmat alpha
svmat gamma
svmat pi
svmat G

gen ind = ""
gen year = ""
local t = 1
foreach var in `varlist' {
	if regexm("`var'", "t(.*)_`ind_stub'(.*)") {
		qui replace year = regexs(1) if _n == `t'
		qui replace ind = regexs(2) if _n == `t'
		}
	local t = `t' + 1
	}

destring ind, replace
destring year, replace
merge 1:1 ind year using `tmp'

collapse (sum) alpha1 , by(ind)

tostring ind, gen(ind_name)

gsort -alpha1

*to get the five origins with highest Rothemberg weights:
list ind alpha1 in 1/5 

*Five origins with highest Rothemberg weights are 23017, 23010, 41030, 31064 and 43019

***********************************************************
**** Figure 3: Plausibility of exogenous shares
* 1) correlates of shares in pre-study period:
use "MR_GPSStests_data", clear

xtset micregion year
keep if year==2003 

*Define macros
local controls l_gdp_ l_averagemonthlywage l_numemp //unemployed_ lhwage_  formal_
local weight population_
local y l_homicrate_
local x l_r_inD_
local ind_stub pin
local growth_stub shock
local time_var year
local cluster_var micregion

tokenize GDP Wages Jobs
local t 1
foreach cont of varlist `controls'{
	preserve
	foreach var in pin23017 pin23010 pin41030 pin31064 pin43019 {
		if regexm("`var'", "pin(.*)") {
			local ind = regexs(1) 
			}
		qui reg `var' `cont' [aweight=population_], cluster(micregion) 
		eststo r`t'_`ind'
		mat c_ = r(table)
		g c_`ind'= c_[4,1]
	}	
	keep c_*
	keep in 1
	g ID = 1
	reshape long c_ , i(ID) j(obs)
	drop ID
		lab var c_ "Origin-specific migration share"
	tempfile cc_`cont'
	g new=_n
	lab define origins 1 "23017" 2 "23010" 3 "31064" 4 "41030" 5 "43019"
	lab values new origins
	scatter c_ new , yline(0.1) xtitle("Origin") xlabel(1 2 3 4 5, nolabel) ytitle(P-value) title("``t''") mlab(obs) saving(`cc_`cont''.gph, replace) legend(off)
	local ++t
	restore
}
       
grc1leg `cc_l_gdp_'.gph `cc_l_averagemonthlywage'.gph `cc_l_numemp'.gph , rows(2) xcom title("Correlations between origin shares and control variables", size(msmall)) subtitle(" ") subtitle(, size(small)) legendfrom(`cc_l_gdp_'.gph) 
	graph export "$graphs\fig3_GPSS_correlates.pdf", replace

***********************************************************
	
**** Figure 4: Check for parallel pre-trends: Regress homicide rate on the initial migration shares and other controls interacted with year dummies for the pre-analysis period 2001-2003
use "$data\MR_GPSStests_data", clear

xtset micregion year
//keep only pre-analysis period:
keep if year<2005 
//update state-year fixed effects to pre-analysis period:
drop uf_year  
egen uf_year=group(uf year) 
//define a pre- and post-period within the pre-analysis period to test for parallel trends
g post=(year==2004)

estimates clear
reghdfe l_homicrate_ i.post##c.(pin23017 pin23010 pin41030 pin31064 pin43019) i.post##c.(zMIVE l_population_) [aw = population_] , absorb(year uf_year) cluster(micregion) 
eststo parallel

coefplot parallel, vertical ciopts(recast(rcap) lcolor(black)) mcolor(black) m(circle) keep(1.post#c.pin23017 1.post#c.pin23010 1.post#c.pin41030 1.post#c.pin31064 1.post#c.pin43019) xtitle(Origin) ytitle(Coefficient size and C.I.) legend(off) yscale(range(0 2)) yline(0) xlabel(1 "23017" 2 "23010" 3 "41030" 4 "31064" 5 "43019")
	graph export "$graphs\fig4_paralleltrends.pdf", replace

***********************************************************

* Table 8: Sensitivity: change weights and change level of employment growth
use "MR_analysis" , clear
xtset micregion year

estimates clear
	** other time period to define weights of IV: 
		*Column 1: whole time period: 2000-2009
ivreghdfe l_homicrate_ l.zMIVE l_population_ (l.l_r_inD_ = l.MIVE_INDT_) [aw = population_] , absorb(micregion DYear uf_year ) cluster(micregion) nocons 
	estadd scalar Finsttest=e(widstat)
	eret2 local fe "Yes"
	eret2 local xs "Yes"
	eret2 local years "Yes"
	eret2 local ufs "Yes"
	eststo M_T
		*Column 2: most recent time period 2004-2009
ivreghdfe l_homicrate_ l.zMIVE l_population_ (l.l_r_inD_ = l.MIVE_INDt_) [aw = population_] , absorb(micregion DYear uf_year ) cluster(micregion) nocons
	estadd scalar Finsttest=e(widstat)
	eret2 local fe "Yes"
	eret2 local xs "Yes"
	eret2 local years "Yes"
	eret2 local ufs "Yes"
	eststo M_t
		*Column 3: employment growth at regional level
ivreghdfe l_homicrate_ l.zMIVE l_population_ (l.l_r_inD_ = l.MRIVE_INDall_) [aw = population_] , absorb(micregion DYear uf_year ) cluster(micregion) nocons
	estadd scalar Finsttest=e(widstat)
	eret2 local fe "Yes"
	eret2 local xs "Yes"
	eret2 local years "Yes"
	eret2 local ufs "Yes"
	eststo M_R
  	** column 4: run main estimation without those two origins that show largest Rotemberg weights and are correlated with local characteristics in 2003:
ivreghdfe l_homicrate_ l.zMIVE l_population_ (l.l_r_inD_ = l.MIVE_rob_) [aw = population_] , absorb(micregion DYear uf_year) cluster(micregion) 
	estadd scalar Ftest=e(F)
	estadd scalar Finsttest=e(widstat)
	eret2 local fe "Yes"
	eret2 local xs "Yes"
	eret2 local years "Yes"
	eret2 local ufs "Yes"
	eststo MIV_rob	
estout M_T M_t M_R MIV_rob using "$tables\tab8_sensitivity.txt", ///
cells(b(star  fmt(3) label(Coef.)) se(par fmt(3))) legend label title(2SLS Estimation: Second stage) postfoot(Weighted by microregion population. S.E. clustered at microregiao level.) stats(xs fe years ufs N Finsttest, labels("Controls" "Microregi\~{a}o FEs" "Year FEs" "State-year FEs" "Observations" "Kleibergen-Paap F-test") fmt(%8.0g %8.0g %8.0f %8.3g)) starlevels(* .10 ** .05 *** .01) replace

***********************************************************
  
*** Table 10: Tests for underidentification and clustering at state level ***
use "MR_analysis" , clear
xtset micregion year

estimates clear

ivreghdfe l_homicrate_ l.zMIVE l_population_ (l.l_r_inD_ = l.MIVE_INDall_) [aw = population_] , absorb(micregion DYear uf_year ) cluster(micregion) nocons first
	estadd scalar Finsttest=e(widstat)
	estadd scalar AndersonRubinP=e(archi2p) 
	estadd scalar UnderidentificationP=e(idp)
	eret2 local fe "Yes"
	eret2 local xs "Yes"
	eret2 local years "Yes"
	eret2 local ufs "Yes"
	eststo M_mr
ivreghdfe l_homicrate_ l.zMIVE l_population_ (l.l_r_inD_ = l.MIVE_INDall_) [aw = population_] , absorb(micregion DYear uf_year ) cluster(uf) nocons first
	estadd scalar Finsttest=e(widstat)
	estadd scalar AndersonRubinP=e(archi2p) 
	estadd scalar UnderidentificationP=e(idp)
	eret2 local fe "Yes"
	eret2 local xs "Yes"
	eret2 local years "Yes"
	eret2 local ufs "Yes"
	eststo M_uf
  **column 3: Alternative estimator: LIML to compare with main estimator:
use "MR_analysis", clear
xtset micregion year
estimates clear
	reghdfe l_homicrate_ l.zMIVE l_population_ (l.l_r_inD_ = l.MIVE_INDall_) [aw = population_] , absorb(micregion DYear uf_year ) cluster(micregion) estimator(liml) old
	eret2 local fe "Yes"
	eret2 local years "Yes"
	eret2 local ufs "Yes"
	estadd scalar Finsttest=e(widstat)
	eststo M_liml	
  estout M* using "$tables\tabA10_tests_cluster.txt", ///
cells(b(star  fmt(3) label(Coef.)) se(par fmt(3))) legend label drop(l_population_ L.zMIVE) title(2SLS Estimation: Second stage) postfoot(Significance levels * 10% ** 5% *** 1%. All estimations are weighted by the Microregi\~{a}o population. All estimates are 2SLS estimates. Controls are the local labor demand shock in t-1 and population. In column 1, S.E. are clustered at Microregi\~{a}o level, in column 2 at state level.) stats(xs fe years ufs N Finsttest AndersonRubinP UnderidentificationP, labels("Controls" "Microregi\~{a}o FEs" "Year FEs" "State-year FEs" "Observations" "Kleibergen-Paap F-test" "AR test for instrument redundancy (p-value)" "Underidentification test (p-value)") fmt(%8.0g %8.0g %8.0g %8.0f %8.1f %8.3f %8.3f)) starlevels(* .10 ** .05 *** .01) replace
  
  
***********************************************************
	
** Table 11: control for origin demand shocks by including IV constructed with distance as weight for IV instead of past migration
use "MR_analysis", clear
xtset micregion year

estimates clear
ivreghdfe l_homicrate_ l.MIVE_DIST_ l.zMIVE l_population_ (l.l_r_inD_ = l.MIVE_INDall_) [aw = population_] , absorb(micregion DYear uf_year) cluster(micregion) 
	estadd scalar Ftest=e(F)
	estadd scalar Finsttest=e(widstat)
	eret2 local fe "Yes"
	eret2 local xs "Yes"
	eret2 local years "Yes"
	eret2 local ufs "Yes"
	eststo MIV_Distcontrol

estout M* using "$tables\tab11_distanceweights.txt", cells(b(star  fmt(3) label(Coef.)) se(par fmt(3))) legend label title(2SLS Estimation: Second stage, distance as weight for IV) postfoot(Weighted by microregion population. S.E. clustered at microregiao level.) stats(xs fe years ufs N R2 Finsttest , labels("Controls" "Microregi\~{a}o FEs" "Year FEs" "State-year FEs" "Observations" "R-squared" "Kleibergen-Paap F-test" ) fmt(%8.0f %8.0g %8.0g %8.3f %8.3f )) starlevels(* .10 ** .05 *** .01) replace

	*Test difference:  //code for bootstrap comparison of coefficients comes from Goldsmith-Pinkham et al. (2020)
use "$data\MR_GPSS_data", clear
set seed 12345
discard
local controls1 l_population_ zMIVE
local controls2 l_population_ zMIVE MIVE_DIST_
local weight population_
local y l_homicrate_
local x l_r_inD_
local z MIVE_INDall
local ind_stub pin
local growth_stub shock
local time_var year
local cluster_var micregion

qui tab year, gen(year_)
drop year_1
qui tab micregion, gen(micregion_)
drop micregion_1
qui tab uf_year, g(uf_year_)
drop uf_year_1

levelsof `time_var', local(years)
foreach year in `years' {
	foreach ind_var of varlist `ind_stub'* {
		gen t`year'_`ind_var' = `ind_var' * (year == `year')
	}
}

qui desc t*_`ind_stub'*, varlist
disp wordcount(r(varlist))

set matsize 4000
global controls1 `controls1'
global controls2 `controls2'
global y `y'
global x `x'
global z `z'
global ind_stub `ind_stub'
global growth_stub `growth_stub'
global weight `weight'
tsset, clear

capture program drop test_and_compare
program define test_and_compare, rclass
	btsls, z($z) x($x) y($y) controls($controls1 micregion_* year_* uf_year_*) ktype("tsls") weight_var($weight)
	local b_bar_2sls_1 = r(beta)
	return scalar b_bar_2sls_1 = `b_bar_2sls_1'
	btsls, z($z) x($x) y($y) controls($controls2 micregion_* year_* uf_year_*) ktype("tsls") weight_var($weight)
	local b_bar_2sls_2 = r(beta)
	return scalar b_bar_2sls_2 = `b_bar_2sls_2'
	return scalar diff_bar_2sls = `b_bar_2sls_1' - `b_bar_2sls_2'
end

local n = 200

ivregress 2sls  `y'  `controls1' micregion_* year_* uf_year_*  (`x'= t*_`ind_stub'* )  [aweight=`weight'], vce(robust)
local insts = e(insts)
local cont = e(exogr)
local insts2 ""
/*** Consruct varlist of independent regressors **/
foreach var in `insts' {
	if ~regexm("`cont'", "`var'") & ~regexm("`var'", "o.") & "t2005_pin_11001" != "`var'" & "t2006_pin_11001" != "`var'" & "t2007_pin_11001" != "`var'" & "t2008_pin_11001" != "`var'" & "t2009_pin_11001" != "`var'" & "t2010_pin_11001" != "`var'"{
		local insts2 "`insts2' `var'"
		}
	else {
		disp "`var'"
		}
	}
test_and_compare "`insts2'"
local b1_bartik = string(r(b_bar_2sls_1), "%12.2f")
local b2_bartik = string(r(b_bar_2sls_2), "%12.2f")

bootstrap b1_bartika = r(b_bar_2sls_1) b2_bartika = r(b_bar_2sls_2) diff_bartik = r(diff_bar_2sls), ///  
  cluster(micregion) reps(`n'): test_and_compare  "`insts2'"
mat pval = r(table)
local se1_bartik = "[" + string(pval[2,1], "%12.2f") + "]"
local se2_bartik = "[" + string(pval[2,2], "%12.2f") + "]"
local p_bartik = "[" + string(pval[4,3], "%12.2f") + "]"

capture file close fh
capture erase "$tables/distance_testdifference.tex"
file open fh using "$tables/distance_testdifference.tex", write replace

file write fh "2SLS (Bartik) & `b1_bartik' & `b2_bartik' & `p_bartik' & \\" _n
file write fh "    & `se1_bartik' & `se2_bartik' &         & \\" _n

file close fh


***********************************************************
**** Table 12: Misspecification and overidentification (and column 4 of table A.10 LIML using individual shares as instruments)

use "MR_GPSS_data", clear
set seed 12345

local controls l_population_ zMIVE
local weight population_
local y l_homicrate_
local x l_r_inD_
local z MIVE_INDall
local ind_stub pin
local growth_stub shock
local time_var year
local cluster_var micregion

qui tab year, gen(year_)
drop year_1
qui tab micregion, gen(micregion_)
drop micregion_1
qui tab uf_year, g(uf_year_)
drop uf_year_1

levelsof `time_var', local(years)
foreach year in `years' {
	foreach ind_var of varlist `ind_stub'* {
		gen t`year'_`ind_var' = `ind_var' * (year == `year')
	}
}

qui desc t*_`ind_stub'*, varlist
disp wordcount(r(varlist))

set matsize 4000
global controls `controls'
global y `y'
global x `x'
global z `z'
global ind_stub `ind_stub'
global growth_stub `growth_stub'
global weight `weight'
tsset, clear

capture program drop test_and_compare
program define test_and_compare, rclass
	btsls, z($z) x($x) y($y) controls(micregion_* year_* uf_year_*) ktype("tsls") weight_var($weight)
	local b_bar_2sls_1 = r(beta)
	return scalar b_bar_2sls_1 = `b_bar_2sls_1'
	btsls, z($z) x($x) y($y) controls($controls micregion_* year_* uf_year_*) ktype("tsls") weight_var($weight)
	local b_bar_2sls_2 = r(beta)
	return scalar b_bar_2sls_2 = `b_bar_2sls_2'
	return scalar diff_bar_2sls = `b_bar_2sls_1' - `b_bar_2sls_2'
	btsls, z("`1'") x($x) y($y) controls(micregion_* year_* uf_year_*) ktype("mbtsls") weight_var($weight)
	local b_sh_mbtsls_1 = r(beta)
	return scalar b_sh_mbtsls_1 = `b_sh_mbtsls_1'
	btsls, z("`1'") x($x) y($y) controls($controls micregion_* year_* uf_year_*) ktype("mbtsls") weight_var($weight)
	local b_sh_mbtsls_2 = r(beta)
	return scalar b_sh_mbtsls_2 = `b_sh_mbtsls_2'
	return scalar diff_sh_mbtsls = `b_sh_mbtsls_1' - `b_sh_mbtsls_2'
end


capture program drop test_and_compare_chao
program define test_and_compare_chao, rclass
	overid_chao, z("`1'") x($x) y($y) controls(micregion_* year_* uf_year_*)  weight_var($weight)
	local b_1 = r(delta)
	return scalar b_1 = `b_1'
	overid_chao, z("`1'") x($x) y($y) controls($controls micregion_* year_* uf_year_*)  weight_var($weight)
	local b_2 = r(delta)
	return scalar b_2 = `b_2'
	return scalar diff = `b_1' - `b_2'
end

local n = 200

estimates clear

ivregress 2sls  `y'  `controls' micregion_* year_* uf_year_*  (`x'= t*_`ind_stub'* )  [aweight=`weight'], vce(robust)
local insts = e(insts)
local cont = e(exogr)
local insts2 ""
/*** Consruct varlist of independent regressors **/
foreach var in `insts' {
	if ~regexm("`cont'", "`var'") & ~regexm("`var'", "o.") & "t2005_pin_11001" != "`var'" & "t2006_pin_11001" != "`var'" & "t2007_pin_11001" != "`var'" & "t2008_pin_11001" != "`var'" & "t2009_pin_11001" != "`var'" & "t2010_pin_11001" != "`var'"{
		local insts2 "`insts2' `var'"
		}
	else {
		disp "`var'"
		}
	}

test_and_compare "`insts2'"
local b1_bartik = string(r(b_bar_2sls_1), "%12.2f")
local b2_bartik = string(r(b_bar_2sls_2), "%12.2f")
local b1_mbtsls = string(r(b_sh_mbtsls_1), "%12.2f")
local b2_mbtsls = string(r(b_sh_mbtsls_2), "%12.2f")

bootstrap b1_bartika = r(b_bar_2sls_1) b2_bartika = r(b_bar_2sls_2) diff_bartik = r(diff_bar_2sls) ///
    b1_mbtslsa = r(b_sh_mbtsls_1)   b2_mbtslsa = r(b_sh_mbtsls_2) diff_mbtslsa = r(diff_sh_mbtsls), ///  
  cluster(micregion) reps(`n'): test_and_compare  "`insts2'"
mat pval = r(table)
local se1_bartik = "[" + string(pval[2,1], "%12.2f") + "]"
local se2_bartik = "[" + string(pval[2,2], "%12.2f") + "]"
local p_bartik = "[" + string(pval[4,3], "%12.2f") + "]"
local se1_mbtsls = "[" + string(pval[2,4], "%12.2f") + "]"
local se2_mbtsls = "[" + string(pval[2,5], "%12.2f") + "]"
local p_mbtsls = "[" + string(pval[4,6], "%12.2f") + "]"

qui ivregress liml `y'  (`x' =  `insts2') micregion_* year_* uf_year_* [aw=`weight'], cluster(micregion)
local b1_liml = string(_b[`x'], "%12.2f")
local se1_liml = "(" + string(_se[`x'], "%12.2f") + ")"
qui ivregress liml `y'  (`x' =  `insts2') `controls' micregion_* year_* uf_year_* [aw=`weight'], cluster(micregion)
local b2_liml = string(_b[`x'], "%12.2f")
local se2_liml = "(" + string(_se[`x'], "%12.2f") + ")"
local N = e(N)
local K = wordcount(e(insts)) - wordcount(e(exogr))
local L = wordcount(e(exogr))
local kappa = e(kappa) - 1
local J_liml  = (`N' - `K' - `L') * (`kappa' - 1)
local alpha_L =  `L' / `N'
local alpha_K =  `K' / `N'
local crit = normal(sqrt((1 - `alpha_L') / ( 1- `alpha_K' - `alpha_L'))*invnorm(0.95)) 
disp chi2(`J_liml', `K'-1)
local Jp_liml = "[" + string(`=chi2(`J_liml', `K'-1)', "%12.2f") + "]"
local J_liml = string(`J_liml', "%12.2f")

capture file close fh
capture erase "$tables/overidtests.tex"
file open fh using "$tables/overidtests.tex", write replace

file write fh "2SLS & `b1_bartik' & `b2_bartik' & `p_bartik' & \\" _n
file write fh "    & `se1_bartik' & `se2_bartik' &         & \\" _n
file write fh "MBTSLS & `b1_mbtsls' & `b2_mbtsls' & `p_mbtsls' & \\" _n
file write fh "    & `se1_mbtsls'& `se2_mbtsls'&         & \\" _n
file write fh "LIML & `b1_liml' & `b2_liml' & `p_liml' & `J_liml'\\" _n
file write fh "    & `se1_liml'& `se2_liml'&         & `Jp_liml' \\" _n

file close fh
	

	
	
*done*
